Welcome!

To get these notes and the datasets we’ll be using, you’ll need to clone or download this repository: https://github.com/dsidavis/RGraphicsWorkshop

You’ll also need the ggplot2 library.

library(ggplot2)
## NB Some other libraries are loaded below. These are all optional.  It's good practice to put your dependencies at the top of your code.  

data_dir = 'data/'
plots_dir = 'plots/'

Base Graphics

Grammar of Graphics

Plot 0: Quick intro to ggplot

## Plot 0 ----
library(datasauRus)
## This is a fun collection of datasets:  They look totally different when they're plotted, but have the same mean, standard deviation, and Pearson correlation coefficient.  

str(datasaurus_dozen_wide, give.attr = FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame':    142 obs. of  26 variables:
##  $ away_x      : num  32.3 53.4 63.9 70.3 34.1 ...
##  $ away_y      : num  61.4 26.2 30.8 82.5 45.7 ...
##  $ bullseye_x  : num  51.2 59 51.9 48.2 41.7 ...
##  $ bullseye_y  : num  83.3 85.5 85.8 85 84 ...
##  $ circle_x    : num  56 50 51.3 51.2 44.4 ...
##  $ circle_y    : num  79.3 79 82.4 79.2 78.2 ...
##  $ dino_x      : num  55.4 51.5 46.2 42.8 40.8 ...
##  $ dino_y      : num  97.2 96 94.5 91.4 88.3 ...
##  $ dots_x      : num  51.1 50.5 50.2 50.1 50.6 ...
##  $ dots_y      : num  90.9 89.1 85.5 83.1 82.9 ...
##  $ h_lines_x   : num  53.4 52.8 47.1 42.4 42.7 ...
##  $ h_lines_y   : num  90.2 90.1 90.5 89.5 90.4 ...
##  $ high_lines_x: num  57.6 51.3 50.8 37 42.9 ...
##  $ high_lines_y: num  83.9 82.8 76.8 82 80.2 ...
##  $ slant_down_x: num  52.9 59 56.4 37.8 39.9 ...
##  $ slant_down_y: num  97.3 93.6 96.3 94.4 90.6 ...
##  $ slant_up_x  : num  47.7 44.6 43.9 41.6 49.2 ...
##  $ slant_up_y  : num  95.2 93.1 94.1 90.3 96.6 ...
##  $ star_x      : num  58.2 58.2 58.7 57.3 58.1 ...
##  $ star_y      : num  91.9 92.2 90.3 89.9 92 ...
##  $ v_lines_x   : num  50.5 50.3 50.2 50.3 50.5 ...
##  $ v_lines_y   : num  93.2 97.6 99.7 90 90 ...
##  $ wide_lines_x: num  65.8 65.7 39 37.8 35.5 ...
##  $ wide_lines_y: num  95.6 91.9 92.3 93.5 89.6 ...
##  $ x_shape_x   : num  38.3 35.8 32.8 33.7 37.2 ...
##  $ x_shape_y   : num  92.5 94.1 88.5 88.6 83.7 ...
str(datasaurus_dozen, give.attr = FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame':    1846 obs. of  3 variables:
##  $ dataset: chr  "dino" "dino" "dino" "dino" ...
##  $ x      : num  55.4 51.5 46.2 42.8 40.8 ...
##  $ y      : num  97.2 96 94.5 91.4 88.3 ...
ggplot(datasaurus_dozen_wide, 
       aes(dino_x, dino_y)) +
    geom_point()

ggplot(datasaurus_dozen, 
       aes(x, y)) +
    geom_point() +
    facet_wrap(~ dataset)

Plot 1: Dumbbell plot

## Plot 1 ----
## Derived from figure 2: <https://www.frontiersin.org/articles/10.3389/frma.2018.00024/full>
counts_df = readRDS(paste0(data_dir, 'paper_counts.Rds'))

counts_df
## # A tibble: 27 x 3
##    subject_area   full sample
##    <fct>         <dbl>  <dbl>
##  1 MEDI         0.159  0.114 
##  2 BIOC         0.127  0.0851
##  3 AGRI         0.131  0.0723
##  4 ENGI         0.0582 0.0620
##  5 PHYS         0.0632 0.0574
##  6 CHEM         0.0401 0.0531
##  7 ENVI         0.0529 0.0498
##  8 MATE         0.0384 0.0483
##  9 SOCI         0.0344 0.0446
## 10 COMP         0.0324 0.0380
## # ... with 17 more rows
ggplot(counts_df, aes(subject_area)) + 
    ## geoms
    geom_point(aes(y = full, color = 'Full'), size = 3) +
    geom_point(aes(y = sample, color = 'Sample'), size = 3) +
    geom_segment(aes(x = subject_area, xend = subject_area, 
                     y = full, yend = sample), 
                 arrow = arrow(angle = 15, type = 'closed', 
                               length = unit(.025, 'snpc'))) +
    ## scales
    xlab('ASJC subject area') +
    scale_y_continuous(labels = scales::percent_format(), 
                       name = 'Papers in dataset') +
    scale_color_brewer(palette = 'Set1', 
                       name = 'Dataset') +
    coord_flip() + 
    ## annotations and theme
    ggtitle('Sampling improves balance across subject areas', 
            subtitle = Sys.time()) +
    annotate(geom = 'label', x = 'HEAL', y = .12, label = Sys.time()) +
    theme_minimal() +
    theme(legend.position = c(.8, .2), 
          legend.background = element_rect(fill = 'white'))

ggsave(paste0(plots_dir, 'plot_1.png'), height = 7, width = 7)

Discussion

  1. What design decisions are being used to highlight the message of this plot?
  2. What alternative designs could be used to communicate the same message?
    • Are any of these alternative designs better than the one I used?
    • Try implementing your idea in ggplot
  3. How does this plot omit, distort, or mislead?

Plot 2: High-density scatterplots

## Plot 2 ----
## Code to construct this dataset is in `climate_data.R`
temp_df = readRDS(paste0(data_dir, 'temp.Rds'))

str(temp_df)
## 'data.frame':    14577 obs. of  12 variables:
##  $ id           : chr  "USC00042294" "USC00042294" "USC00042294" "USC00042294" ...
##  $ name         : chr  "Davis" "Davis" "Davis" "Davis" ...
##  $ date         : Date, format: "2009-01-01" "2009-01-02" ...
##  $ year         : num  2009 2009 2009 2009 2009 ...
##  $ month        : chr  "January" "January" "January" "January" ...
##  $ day          : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ temp_min     : num  3.3 3.9 0.6 -2.8 -2.2 1.7 0 3.9 0.6 -2.8 ...
##  $ temp_max     : num  6.1 6.7 10.6 11.1 10 7.2 12.8 7.2 10 16.7 ...
##  $ temp_delta   : num  2.8 2.8 10 13.9 12.2 5.5 12.8 3.3 9.4 19.5 ...
##  $ precipitation: num  0 0 0 0 5 8 0 0 0 0 ...
##  $ lat          : num  38.5 38.5 38.5 38.5 38.5 ...
##  $ lon          : num  -122 -122 -122 -122 -122 ...
ggplot(temp_df, aes(temp_max, temp_delta)) +
    ## geoms
    # geom_point() +
    # geom_count() +
    # geom_bin2d() +
    # geom_hex(aes(color = ..count..)) +
    # geom_density2d(size = 1) +
    # stat_density2d(contour = FALSE, geom = 'raster', aes(fill = ..density..), show.legend = FALSE) +
    stat_density2d(geom = 'polygon', aes(fill = ..level..), show.legend = FALSE) +
    ## facets
    facet_wrap(~ name) +
    ## Annotations are repeated within each facet
    # annotate(geom = 'label', x = 200, y = 5, label = Sys.time()) +
    ## scales
    xlab('Daily maximum temperature (ºC)') +
    ylab('Daily temperature difference (ºC)') +
    scale_color_viridis_c(option = 'A') +
    scale_fill_viridis_c(option = 'A') +
    ## themes
    theme_bw() +
    ggtitle('Temperature difference is associated with maximum daily temperature', 
            subtitle = Sys.time())
## Warning: Removed 104 rows containing non-finite values (stat_density2d).

ggsave(paste0(plots_dir, 'plot_2.png'), height = 7, width = 7)
## Warning: Removed 104 rows containing non-finite values (stat_density2d).

Discussion

  1. What are the strengths and weaknesses of each different option?

Plot 3: Time series + model predictions

## Plot 3 ----
## Hack to get month labels on day-as-year
## <https://stackoverflow.com/a/39861306/3187973>
temp_df$day.date = as.Date(temp_df$day, '2009-01-01')

## Model **for Sacramento alone**
model = lm(data = temp_df, temp_delta ~ day + I(day^2), 
           subset = name == 'Sacramento')

predictions = predict(model, interval = 'confidence', level = .95)
predictions = as.data.frame(predictions)
predictions$day = model$model$day
predictions$day.date = as.Date(predictions$day, '2009-01-01')

ggplot(temp_df, aes(day.date, temp_delta)) +
    ## geoms
    geom_path(aes(color = month, group = year), alpha = .3, show.legend = FALSE) +
    ## Separate models for each panel
    geom_smooth(method = lm, formula = y ~ x + I(x^2),
                color = 'red', size = 1) +
    ## Sacramento-only model
    ## Use the smooth geom to get curve + ribbon
    ## But set `stat = 'identity'` because we don't want to fit a (new) model
    geom_smooth(data = predictions, stat = 'identity',
                aes(y = fit, ymax = upr, ymin = lwr),
                color = 'blue', fill = 'blue', size = 1) +
    ## facets
    facet_wrap(~ name) +
    ## scales
    scale_x_date(name = 'Month', date_breaks = '3 month', date_labels = '%b') +  
    scale_y_continuous(name = 'Daily temperature difference (max - min)', 
                       labels = scales::math_format(expr = .x*degree*C)) +
    scale_color_viridis_d(limits = month.name,
                          option = 'E',
                          direction = -1) +
    # coord_polar() +
    ## themes
    theme_bw() +
    ggtitle('Sacramento has smaller temperature differences than other sites', 
            subtitle = Sys.time())
## Warning: Removed 104 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_path).

## Note effect of aspect ratios on perception
ggsave(paste0(plots_dir, 'plot_3.png'), height = 7, width = 7)
## Warning: Removed 104 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing missing values (geom_path).
ggsave(paste0(plots_dir, 'plot_3a.png'), height = 7, width = 12)
## Warning: Removed 104 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing missing values (geom_path).

Discussion

  1. What are the dangers of using models internal to the plot?
  2. Aspect ratios change our perception of the plot. Does this mean one aspect ratio is misleading?

Plot 4: Visualizing spatial data in R

## Plot 4 ----
library(sf)
## Linking to GEOS 3.6.1, GDAL 2.1.3, PROJ 4.9.3

ggplot2 (version >=3) can plot vector spatial data from sf objects using geom_sf().

if (packageVersion('ggplot2') < package_version('3.0.0')) {
    stop('ggplot2 version 3 or later is required for this section')
}

This section looks at two different ways of adding basemaps under geom_sf() plots

stations = readRDS(paste0(data_dir, 'stations.Rds'))
stations.utm = readRDS(paste0(data_dir, 'stations.utm.Rds'))

stations
## Simple feature collection with 4 features and 4 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: -122.1241 ymin: 38.4916 xmax: -119.9947 ymax: 38.8983
## epsg (SRID):    4326
## proj4string:    +proj=longlat +ellps=WGS84 +no_defs
## # A tibble: 4 x 5
##   name           ghcnd         lat   lon            geometry
##   <chr>          <chr>       <dbl> <dbl>         <POINT [°]>
## 1 Davis          USC00042294  38.5 -122. (-121.7761 38.5349)
## 2 Sacramento     USW00023271  38.6 -121. (-121.4183 38.5552)
## 3 Lake Tahoe     USW00093230  38.9 -120. (-119.9947 38.8983)
## 4 Lake Berryessa USC00045360  38.5 -122. (-122.1241 38.4916)

ggmap approach

library(ggmap)
## Warning: package 'ggmap' was built under R version 3.5.2
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
## - Services other than Google Maps are deprecated
## - Requires Google account, API key
## - Doesn't seem to play nicely with coordinates other than lat/lon

## See ?register_google for links and instructions
## ***DO NOT SHARE YOUR API KEY***
# register_google(key = 'notarealkey', write = TRUE)
if (!has_google_key()) {
    stop('You need a Google Maps API key.  See ?register_google')
}

basemap = get_map(location = c(-121.4183, 38.5552), 
                  maptype = 'terrain',
                  zoom = 8)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=38.5552,-121.4183&zoom=8&size=640x640&scale=2&maptype=terrain&language=en-EN&key=xxx
ggmap(basemap) +
    geom_sf(data = stations, inherit.aes = FALSE) +
    geom_sf_label(data = stations, 
                  aes(label = name), nudge_y = -.1, size = 3)
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Warning in st_point_on_surface.sfc(sf::st_zm(x)): st_point_on_surface may
## not give correct results for longitude/latitude data

ggsave(paste0(plots_dir, 'plot_4_ggmap.png'), height = 7, width = 7)
## Warning in st_point_on_surface.sfc(sf::st_zm(x)): st_point_on_surface may
## not give correct results for longitude/latitude data
## Example with UTM coordinates
# ggmap(basemap) +
#     geom_sf(data = stations.utm, inherit.aes = FALSE)

ggspatial approach

library(ggspatial)
## - annotation_map_tile() is slow, even with cached images
## See rosm::osm.types() for a list of basemap types
ggplot(data = stations) +
    annotation_map_tile(type = 'osm', zoom = 10) +
    geom_sf_label(data = stations, 
                  aes(label = name), nudge_y = .03, size = 3) +
    geom_sf() +
    annotation_scale(location = 'tl') +
    annotation_north_arrow(location = 'br', which_north = 'true', 
                           style = north_arrow_minimal)
## Warning in st_point_on_surface.sfc(sf::st_zm(x)): st_point_on_surface may
## not give correct results for longitude/latitude data
## Loading required namespace: raster
## Zoom: 10

ggsave(paste0(plots_dir, 'plot_4_ggspatial.png'), height = 7, width = 7)
## Warning in st_point_on_surface.sfc(sf::st_zm(x)): st_point_on_surface may
## not give correct results for longitude/latitude data
## Zoom: 10
## Example with UTM coordinates
# ggplot(data = stations.utm) +
#     annotation_map_tile(type = 'osm', zoom = 10,
#                         cachedir = system.file('rosm.cache', package = 'ggspatial')) +
#     geom_sf_label(data = stations,
#                   aes(label = name), nudge_y = 1000, size = 3) +
#     geom_sf()

  1. Strictly speaking, base is the name of a particular package; “base graphics” aren’t included in this package, but instead are in the graphics package. So “base graphics” is strictly incorrect. But “graphics graphics” sounds silly, and the graphics package is loaded automatically when you launch R, so everyone just calls it “base graphics.”